Introduction

Preface

This is part two of a look at mass-shooting gun violence in the US. In this analysis, we will model the shooting incident data as monthly time series by state and forecast future incidents for the states with the most incidents.

Setup

Load Libraries

library(tidyverse)
library(tidymodels)
library(tidytext)
library(lubridate)
library(scales)
library(janitor)
library(modeltime)
library(tictoc)
library(future)
library(doFuture)
library(tidyquant)
library(timetk)
library(thief)

# Register the future framework as the foreach backend so downstream
# packages (e.g. modeltime's parallel fitting) can use it.
registerDoFuture()

# NOTE(review): detectCores() grabs every core on the machine; consider
# parallelly::availableCores(), which respects scheduler/cgroup limits.
n_cores <- parallel::detectCores()

# Distribute future-based work across a PSOCK cluster with one worker per
# detected core.
# NOTE(review): the cluster created here is never explicitly stopped in this
# document -- confirm it is cleaned up at session end.
plan(strategy = cluster,
     workers  = parallel::makeCluster(n_cores))

Source the data

Read in the saved data file.

# Full incident data set saved at the end of part one of this series.
complete_tbl <- read_rds("gva_data.rds") 
# Character vector of the nine states used later to subset the models
# (the CSV also carries counts/ranks; only the state names are kept).
top_nine_states <- read_csv("top_nine_states.csv") %>% 
    pull(state)
## Rows: 9 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, state_code
## dbl (4): n_incidents, n_deaths, n_injuries, incidents_rank
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Here I prepare the primary data set to include some date-related fields for convenience when plotting and summarizing the data.

# Prepare the primary data set: clean column names, parse the incident date,
# and derive the date-part columns used for plotting and summarizing below.
# Fix: use TRUE rather than the reassignable shorthand T.
ms_tbl <- complete_tbl %>%
    janitor::clean_names() %>%
    select(-operations) %>%                    # drop unused column
    mutate(incident_date = mdy(incident_date), # "m/d/Y" text -> Date
           year = year(incident_date),
           month = month(incident_date, label = TRUE, abbr = TRUE),
           # last day of the incident's month; serves as the monthly index
           incident_month = rollforward(incident_date),
           dow = wday(incident_date, label = TRUE)) %>%
    arrange(incident_date)

Here I create a function to group and summarize the data in different ways since we will need to do this many times later.

#' Group a data frame and summarize shooting-incident counts.
#'
#' @param df A data frame containing `number_killed` and `number_injured`.
#' @param ... Grouping columns, passed through to `group_by()`.
#' @return An ungrouped tibble with one row per group and the columns
#'   `n_incidents`, `n_deaths`, and `n_injuries`.
group_function <- function(df, ...) {
    # Fixes: TRUE instead of the reassignable shorthand T; the explicit
    # return() was unnecessary -- the last expression is returned.
    df %>%
        group_by(...) %>%
        summarize(n_incidents = n(),
                  n_deaths = sum(number_killed, na.rm = TRUE),
                  n_injuries = sum(number_injured, na.rm = TRUE),
                  .groups = "drop")
}

Nesting the time series data

# Forecast horizon: number of future months each series is extended by, and
# also the length of the test split used below.
future_date <- 12

# (kept for reference) US-level aggregate version of the model data.
# ms_model_data <- ms_tbl %>%
#     filter(incident_date < floor_date(Sys.Date(), "month"),
#            year(incident_date) >= 2020) %>%
#     group_function(incident_month) %>%
#     select(-n_injuries, -n_deaths) %>%
#     mutate(state = "us")
#     #group_by(state) %>%
#     #filter(n() > 10) %>%
#     #ungroup()

# Per-state monthly incident counts for modeling. Note the ordering:
# complete() fills month gaps with explicit zero-incident rows BEFORE the
# data is restricted to the top nine states, so the month grid is built from
# every state's observed months -- do not reorder these steps.
ms_model_state_data <- ms_tbl %>%
    filter(incident_date < floor_date(Sys.Date(), "month"),  # drop partial current month
           year(incident_date) >= 2020) %>%
    group_function(state, incident_month) %>%
    select(-n_injuries, -n_deaths) %>%
    complete(state, incident_month, fill = list(n_incidents = 0)) %>%
    filter(state %in% top_nine_states)

# Nest each state's monthly series, extend it `future_date` months into the
# future (rows to be forecast), and carve out the last `future_date` observed
# months as the test split.
nested_data_tbl <- ms_model_state_data %>% #ms_model_data %>%
    group_by(state) %>%
    # NOTE(review): extend_timeseries() receives .id_var directly, so the
    # preceding group_by(state) looks redundant -- confirm before removing.
    modeltime::extend_timeseries(.id_var = state,
                                 .date_var = incident_month,
                                 .length_future = future_date) %>%
    modeltime::nest_timeseries(.id_var = state,
                    .length_future = future_date) %>%
    modeltime::split_nested_timeseries(.length_test = future_date)

Models

# # MODELING ----
# # * XGBoost Recipe ----
# Feature recipe for the pure machine-learning models: expand the date into
# calendar-signature features, then drop the raw date column itself
# (xgboost/ranger cannot consume a Date predictor directly).
train_split_tbl <- extract_nested_train_split(nested_data_tbl)

rec_xgb <- recipe(n_incidents ~ ., data = train_split_tbl) %>%
    step_timeseries_signature(incident_month) %>%
    step_rm(incident_month) %>%
    step_zv(all_predictors()) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Feature recipe for the hybrid models (ARIMA/Prophet + xgboost): the raw
# date column is retained for the time-series component, with the calendar
# signature features available to the boosting component.
rec_arima <- recipe(n_incidents ~ .,
                    data = extract_nested_train_split(nested_data_tbl)) %>%
    step_timeseries_signature(incident_month) %>%
    step_zv(all_predictors()) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE)

# * XGBoost Models ----

# XGBoost workflow, moderate learning rate (0.35).
xgb_spec_035 <- boost_tree(mode = "regression", learn_rate = 0.35) %>%
    set_engine("xgboost")

wflw_xgb_1 <- workflow() %>%
    add_recipe(rec_xgb) %>%
    add_model(xgb_spec_035)

# XGBoost workflow, higher learning rate (0.50).
xgb_spec_050 <- boost_tree(mode = "regression", learn_rate = 0.50) %>%
    set_engine("xgboost")

wflw_xgb_2 <- workflow() %>%
    add_recipe(rec_xgb) %>%
    add_model(xgb_spec_050)

# * RandForest Model ----
# Random-forest workflow (ranger engine, default hyperparameters).
rf_spec <- rand_forest(mode = "regression") %>%
    set_engine("ranger")

wflw_rf_1 <- workflow() %>%
    add_recipe(rec_xgb) %>%
    add_model(rf_spec)

# * ARIMA Boost
# Boosted ARIMA workflow: ARIMA on the date column, xgboost on the residuals.
arima_spec <- arima_boost(mode = "regression") %>%
    set_engine("arima_xgboost")

wflw_arima_1 <- workflow() %>%
    add_recipe(rec_arima) %>%
    add_model(arima_spec)

# *  Prophet Boost
# Boosted Prophet workflow: Prophet on the date column, xgboost on the
# residuals.
prophet_spec <- prophet_boost(mode = "regression") %>%
    set_engine("prophet_xgboost")

wflw_prophet_1 <- workflow() %>%
    add_recipe(rec_arima) %>%
    add_model(prophet_spec)

Test run and error checking

# # 1.0 TRY 1 TIME SERIES ----
# #   - Tells us if our models work at least once (before we scale)
# Smoke test: fit every candidate workflow on a single nested series to make
# sure each model specification runs at least once before scaling up.
candidate_models <- list(
    wflw_xgb_1,
    wflw_xgb_2,
    wflw_rf_1,
    wflw_arima_1,
    #wflw_ets_1,
    wflw_prophet_1)

try_sample_tbl <- nested_data_tbl %>%
    slice_head(n = 1) %>%
    modeltime_nested_fit(
        model_list = candidate_models,
        control = control_nested_fit(
            verbose   = TRUE,
            allow_par = FALSE))
## ℹ [1/1] Starting Modeltime Table: ID California...
## ✔ Model 1 Passed BOOST_TREE.
## ✔ Model 2 Passed BOOST_TREE.
## ✔ Model 3 Passed RAND_FOREST.
## ✔ Model 4 Passed ARIMA_BOOST.
## ✔ Model 5 Passed PROPHET_BOOST.
## ✔ [1/1] Finished Modeltime Table: ID California
## Finished in: 7.511683 secs.
# Print the fitted sample table.
try_sample_tbl
# * Check Errors ----
# Surface any per-model errors captured during the sample fit.
try_sample_tbl %>% extract_nested_error_report()

Scale up and process models

# # 2.0 SCALE ----
# # 2.0 SCALE ----
# Start modeltime's parallel backend sized to the machine.
# Fix: use the n_cores detected during setup instead of a hard-coded 16,
# keeping the worker count consistent with the plan() configuration above.
parallel_start(n_cores)

# Fit every candidate workflow to each state's nested series in parallel.
nested_modeltime_tbl <- nested_data_tbl %>%
    modeltime_nested_fit(
        model_list = list(
            wflw_xgb_1,
            wflw_xgb_2,
            wflw_rf_1,
            wflw_arima_1,
            #wflw_ets_1,
            wflw_prophet_1),
        control = control_nested_fit(
            verbose   = TRUE,
            allow_par = TRUE))
## Using existing parallel backend with 16 clusters (cores)...
##  Beginning Parallel Loop | 0.011 seconds
##  Finishing parallel backend. Clusters are remaining open. | 24.034 seconds
##  Close clusters by running: `parallel_stop()`.
## Finished in: 24.03425 secs.

Error review

# # * Review Any Errors ----
# Surface any per-model errors captured during the full nested fit.
nested_modeltime_tbl %>% extract_nested_error_report()

Accuracy review

# # * Review Test Accuracy ----
# Tabulate test-split accuracy metrics for every state/model combination.
nested_modeltime_tbl %>%
    extract_nested_test_accuracy() %>%
    table_modeltime_accuracy()

Test Forecast against actuals

# # * Visualize Test Forecast ----
# Plot each model's test-split forecast against actuals, faceted by state.
nested_modeltime_tbl %>%
    extract_nested_test_forecast() %>%
    group_by(state) %>%
    plot_modeltime_forecast(.facet_ncol = 3)

Select best models

# # 3.0 SELECT BEST ----
# For each state, keep only the model with the lowest test-set MAE.
nested_best_tbl <- nested_modeltime_tbl %>%
    modeltime_nested_select_best(metric = "mae", minimize = TRUE)

Visualize best models

# # * Visualize Best Models ----
# Plot the winning model's test-split forecast per state.
nested_best_tbl %>%
    extract_nested_test_forecast() %>%
    group_by(state) %>%
    plot_modeltime_forecast(.facet_ncol = 3)

Model refits

# # 4.0 REFIT ----
# Refit each state's winning model on its full series (train + test) so the
# future forecast uses every available observation.
nested_best_refit_tbl <- nested_best_tbl %>%
    modeltime_nested_refit(
        control = control_refit(
            verbose   = TRUE,
            allow_par = TRUE))
## Using existing parallel backend with 16 clusters (cores)...
##  Beginning Parallel Loop | 0.004 seconds
##  Finishing parallel backend. Clusters are remaining open. | 0.996 seconds
##  Close clusters by running: `parallel_stop()`.
## Finished in: 0.996784 secs.
# Shut down the parallel workers started before the full fit.
parallel_stop()

Final error review

# # * Review Any Errors ----
# Surface any per-model errors captured during the refit step.
nested_best_refit_tbl %>% extract_nested_error_report()

Visualize future forecast

# # * Visualize Future Forecast ----
# Plot the 12-month future forecast per state. The lower confidence bound is
# clamped at zero since negative incident counts are not meaningful; pmax()
# leaves NA bounds (the actual-data rows) untouched, exactly as the original
# case_when() clamp did.
nested_best_refit_tbl %>%
    extract_nested_future_forecast() %>%
    mutate(.conf_lo = pmax(.conf_lo, 0)) %>%
    group_by(state) %>%
    plot_modeltime_forecast(.facet_ncol = 3)